import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix,accuracy_score
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Activation
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
# Apply the 'ggplot' style to matplotlib figures; the IPython magic on the next
# line asks the notebook to render figures inline.
plt.style.use('ggplot')
%matplotlib inline
Using TensorFlow backend.
# Names for the 26 columns kept from the raw files: id, cycle, three settings
# and the s1..s21 columns.
col_names = [
    'id', 'cycle',
    'setting1', 'setting2', 'setting3',
    's1', 's2', 's3', 's4', 's5', 's6', 's7',
    's8', 's9', 's10', 's11', 's12', 's13', 's14',
    's15', 's16', 's17', 's18', 's19', 's20', 's21',
]

# Read the space-separated training file without a header row, discard the
# columns labelled 26 and 27, then attach the readable column names.
dataset_train = pd.read_csv('PM_train.txt', header=None, sep=' ')
dataset_train = dataset_train.drop(columns=[26, 27])
dataset_train.columns = col_names

print('Shape of Train dataset: ', dataset_train.shape)
dataset_train.head()
Shape of Train dataset: (20631, 26)
| id | cycle | setting1 | setting2 | setting3 | s1 | s2 | s3 | s4 | s5 | ... | s12 | s13 | s14 | s15 | s16 | s17 | s18 | s19 | s20 | s21 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | -0.0007 | -0.0004 | 100.0 | 518.67 | 641.82 | 1589.70 | 1400.60 | 14.62 | ... | 521.66 | 2388.02 | 8138.62 | 8.4195 | 0.03 | 392 | 2388 | 100.0 | 39.06 | 23.4190 |
| 1 | 1 | 2 | 0.0019 | -0.0003 | 100.0 | 518.67 | 642.15 | 1591.82 | 1403.14 | 14.62 | ... | 522.28 | 2388.07 | 8131.49 | 8.4318 | 0.03 | 392 | 2388 | 100.0 | 39.00 | 23.4236 |
| 2 | 1 | 3 | -0.0043 | 0.0003 | 100.0 | 518.67 | 642.35 | 1587.99 | 1404.20 | 14.62 | ... | 522.42 | 2388.03 | 8133.23 | 8.4178 | 0.03 | 390 | 2388 | 100.0 | 38.95 | 23.3442 |
| 3 | 1 | 4 | 0.0007 | 0.0000 | 100.0 | 518.67 | 642.35 | 1582.79 | 1401.87 | 14.62 | ... | 522.86 | 2388.08 | 8133.83 | 8.3682 | 0.03 | 392 | 2388 | 100.0 | 38.88 | 23.3739 |
| 4 | 1 | 5 | -0.0019 | -0.0002 | 100.0 | 518.67 | 642.37 | 1582.85 | 1406.22 | 14.62 | ... | 522.19 | 2388.04 | 8133.80 | 8.4294 | 0.03 | 393 | 2388 | 100.0 | 38.90 | 23.4044 |
5 rows × 26 columns
# Number of training rows recorded for each id, largest count first.
dataset_train['id'].value_counts()
69 362
92 341
96 336
67 313
83 293
...
24 147
57 137
70 137
91 135
39 128
Name: id, Length: 100, dtype: int64
# Read the space-separated test file the same way as the training file:
# no header row, columns labelled 26 and 27 discarded (presumably empty
# columns produced by trailing separators -- confirm against the raw file),
# and the shared column names applied.
dataset_test = pd.read_csv('PM_test.txt', sep=' ', header=None).drop([26, 27], axis=1)
dataset_test.columns = col_names

# Report and preview the test frame itself. This cell previously printed and
# displayed dataset_train under the "Test" label.
print('Shape of Test dataset: ', dataset_test.shape)
dataset_test.head()
Shape of Test dataset: (20631, 26)
| id | cycle | setting1 | setting2 | setting3 | s1 | s2 | s3 | s4 | s5 | ... | s12 | s13 | s14 | s15 | s16 | s17 | s18 | s19 | s20 | s21 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | -0.0007 | -0.0004 | 100.0 | 518.67 | 641.82 | 1589.70 | 1400.60 | 14.62 | ... | 521.66 | 2388.02 | 8138.62 | 8.4195 | 0.03 | 392 | 2388 | 100.0 | 39.06 | 23.4190 |
| 1 | 1 | 2 | 0.0019 | -0.0003 | 100.0 | 518.67 | 642.15 | 1591.82 | 1403.14 | 14.62 | ... | 522.28 | 2388.07 | 8131.49 | 8.4318 | 0.03 | 392 | 2388 | 100.0 | 39.00 | 23.4236 |
| 2 | 1 | 3 | -0.0043 | 0.0003 | 100.0 | 518.67 | 642.35 | 1587.99 | 1404.20 | 14.62 | ... | 522.42 | 2388.03 | 8133.23 | 8.4178 | 0.03 | 390 | 2388 | 100.0 | 38.95 | 23.3442 |
| 3 | 1 | 4 | 0.0007 | 0.0000 | 100.0 | 518.67 | 642.35 | 1582.79 | 1401.87 | 14.62 | ... | 522.86 | 2388.08 | 8133.83 | 8.3682 | 0.03 | 392 | 2388 | 100.0 | 38.88 | 23.3739 |
| 4 | 1 | 5 | -0.0019 | -0.0002 | 100.0 | 518.67 | 642.37 | 1582.85 | 1406.22 | 14.62 | ... | 522.19 | 2388.04 | 8133.80 | 8.4294 | 0.03 | 393 | 2388 | 100.0 | 38.90 | 23.4044 |
5 rows × 26 columns
# Read the ground-truth file (no header row), keep only its first column as
# 'more', and number the rows 1..N in a new 'id' column.
pm_truth = pd.read_csv('PM_truth.txt', header=None, sep=' ')
pm_truth = pm_truth.drop(columns=[1])
pm_truth.columns = ['more']
pm_truth['id'] = pm_truth.index + 1
pm_truth.head()
| more | id | |
|---|---|---|
| 0 | 112 | 1 |
| 1 | 98 | 2 |
| 2 | 69 | 3 |
| 3 | 82 | 4 |
| 4 | 91 | 5 |
# (rows, columns) of the ground-truth frame after adding 'id'.
pm_truth.shape
(100, 2)
# Largest cycle value seen for each id in the test frame, as a two-column
# frame ('id', 'max') with a fresh 0-based index.
max_cycle_per_id = dataset_test.groupby('id')['cycle'].max()
rul = max_cycle_per_id.to_frame().reset_index()
rul.columns = ['id', 'max']
rul.head()
| id | max | |
|---|---|---|
| 0 | 1 | 31 |
| 1 | 2 | 49 |
| 2 | 3 | 126 |
| 3 | 4 | 106 |
| 4 | 5 | 98 |
# (rows, columns) of the per-id maximum-cycle frame.
rul.shape
(100, 2)
# 'rtf' = the ground-truth 'more' value plus the last cycle observed for the
# same id in the test frame (presumably the total run-to-failure cycle count
# -- confirm against the dataset description).
# The last cycle is looked up by 'id' rather than added row-by-row, so the
# result no longer depends on pm_truth and rul sharing the same index order.
pm_truth['rtf'] = pm_truth['more'] + pm_truth['id'].map(rul.set_index('id')['max'])
pm_truth.head()
| more | id | rtf | |
|---|---|---|---|
| 0 | 112 | 1 | 143 |
| 1 | 98 | 2 | 147 |
| 2 | 69 | 3 | 195 |
| 3 | 82 | 4 | 188 |
| 4 | 91 | 5 | 189 |
# (rows, columns) of the ground-truth frame after adding 'rtf'.
pm_truth.shape
(100, 3)
# 'more' is no longer needed once 'rtf' exists; remove it in place.
del pm_truth['more']

# Attach each id's 'rtf' to every test row (left join keeps all test rows),
# turn it into a per-row 'ttf' by subtracting the row's cycle, then discard
# the helper 'rtf' column.
dataset_test = pd.merge(dataset_test, pm_truth, how='left', on='id')
dataset_test['ttf'] = dataset_test['rtf'] - dataset_test['cycle']
dataset_test = dataset_test.drop(columns='rtf')
dataset_test.head()
| id | cycle | setting1 | setting2 | setting3 | s1 | s2 | s3 | s4 | s5 | ... | s13 | s14 | s15 | s16 | s17 | s18 | s19 | s20 | s21 | ttf | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 0.0023 | 0.0003 | 100.0 | 518.67 | 643.02 | 1585.29 | 1398.21 | 14.62 | ... | 2388.03 | 8125.55 | 8.4052 | 0.03 | 392 | 2388 | 100.0 | 38.86 | 23.3735 | 142 |
| 1 | 1 | 2 | -0.0027 | -0.0003 | 100.0 | 518.67 | 641.71 | 1588.45 | 1395.42 | 14.62 | ... | 2388.06 | 8139.62 | 8.3803 | 0.03 | 393 | 2388 | 100.0 | 39.02 | 23.3916 | 141 |
| 2 | 1 | 3 | 0.0003 | 0.0001 | 100.0 | 518.67 | 642.46 | 1586.94 | 1401.34 | 14.62 | ... | 2388.03 | 8130.10 | 8.4441 | 0.03 | 393 | 2388 | 100.0 | 39.08 | 23.4166 | 140 |
| 3 | 1 | 4 | 0.0042 | 0.0000 | 100.0 | 518.67 | 642.44 | 1584.12 | 1406.42 | 14.62 | ... | 2388.05 | 8132.90 | 8.3917 | 0.03 | 391 | 2388 | 100.0 | 39.00 | 23.3737 | 139 |
| 4 | 1 | 5 | 0.0014 | 0.0000 | 100.0 | 518.67 | 642.51 | 1587.19 | 1401.92 | 14.62 | ... | 2388.03 | 8129.54 | 8.4031 | 0.03 | 390 | 2388 | 100.0 | 38.99 | 23.4130 | 138 |
5 rows × 27 columns
# (rows, columns) of the test frame after adding 'ttf'.
dataset_test.shape
(13096, 27)
# 'ttf' for each training row: the largest cycle recorded for the row's id
# minus the row's own cycle, so the last row of every id gets 0.
# The aggregation is named by string ('max') instead of passing the builtin
# `max`, which newer pandas versions deprecate for groupby transform.
dataset_train['ttf'] = (
    dataset_train.groupby('id')['cycle'].transform('max') - dataset_train['cycle']
)
dataset_train.head()
| id | cycle | setting1 | setting2 | setting3 | s1 | s2 | s3 | s4 | s5 | ... | s13 | s14 | s15 | s16 | s17 | s18 | s19 | s20 | s21 | ttf | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | -0.0007 | -0.0004 | 100.0 | 518.67 | 641.82 | 1589.70 | 1400.60 | 14.62 | ... | 2388.02 | 8138.62 | 8.4195 | 0.03 | 392 | 2388 | 100.0 | 39.06 | 23.4190 | 191 |
| 1 | 1 | 2 | 0.0019 | -0.0003 | 100.0 | 518.67 | 642.15 | 1591.82 | 1403.14 | 14.62 | ... | 2388.07 | 8131.49 | 8.4318 | 0.03 | 392 | 2388 | 100.0 | 39.00 | 23.4236 | 190 |
| 2 | 1 | 3 | -0.0043 | 0.0003 | 100.0 | 518.67 | 642.35 | 1587.99 | 1404.20 | 14.62 | ... | 2388.03 | 8133.23 | 8.4178 | 0.03 | 390 | 2388 | 100.0 | 38.95 | 23.3442 | 189 |
| 3 | 1 | 4 | 0.0007 | 0.0000 | 100.0 | 518.67 | 642.35 | 1582.79 | 1401.87 | 14.62 | ... | 2388.08 | 8133.83 | 8.3682 | 0.03 | 392 | 2388 | 100.0 | 38.88 | 23.3739 | 188 |
| 4 | 1 | 5 | -0.0019 | -0.0002 | 100.0 | 518.67 | 642.37 | 1582.85 | 1406.22 | 14.62 | ... | 2388.04 | 8133.80 | 8.4294 | 0.03 | 393 | 2388 | 100.0 | 38.90 | 23.4044 | 187 |
5 rows × 27 columns
# (rows, columns) of the training frame after adding 'ttf'.
dataset_train.shape
(20631, 27)
# How many training rows share each 'ttf' value, most common first.
dataset_train['ttf'].value_counts()
0 100
123 100
121 100
89 100
73 100
...
341 1
356 1
355 1
354 1
351 1
Name: ttf, Length: 362, dtype: int64
# Work on copies so the frames built above stay unmodified.
df_train = dataset_train.copy()
df_test = dataset_test.copy()

# Binary label: 1 when 'ttf' is at most `period`, otherwise 0.
# A vectorised comparison replaces the per-row lambda; the explicit int64 cast
# keeps the same integer dtype the lambda version produced.
period = 30
df_train['label_bc'] = (df_train['ttf'] <= period).astype('int64')
df_test['label_bc'] = (df_test['ttf'] <= period).astype('int64')
df_train.head()
| id | cycle | setting1 | setting2 | setting3 | s1 | s2 | s3 | s4 | s5 | ... | s14 | s15 | s16 | s17 | s18 | s19 | s20 | s21 | ttf | label_bc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | -0.0007 | -0.0004 | 100.0 | 518.67 | 641.82 | 1589.70 | 1400.60 | 14.62 | ... | 8138.62 | 8.4195 | 0.03 | 392 | 2388 | 100.0 | 39.06 | 23.4190 | 191 | 0 |
| 1 | 1 | 2 | 0.0019 | -0.0003 | 100.0 | 518.67 | 642.15 | 1591.82 | 1403.14 | 14.62 | ... | 8131.49 | 8.4318 | 0.03 | 392 | 2388 | 100.0 | 39.00 | 23.4236 | 190 | 0 |
| 2 | 1 | 3 | -0.0043 | 0.0003 | 100.0 | 518.67 | 642.35 | 1587.99 | 1404.20 | 14.62 | ... | 8133.23 | 8.4178 | 0.03 | 390 | 2388 | 100.0 | 38.95 | 23.3442 | 189 | 0 |
| 3 | 1 | 4 | 0.0007 | 0.0000 | 100.0 | 518.67 | 642.35 | 1582.79 | 1401.87 | 14.62 | ... | 8133.83 | 8.3682 | 0.03 | 392 | 2388 | 100.0 | 38.88 | 23.3739 | 188 | 0 |
| 4 | 1 | 5 | -0.0019 | -0.0002 | 100.0 | 518.67 | 642.37 | 1582.85 | 1406.22 | 14.62 | ... | 8133.80 | 8.4294 | 0.03 | 393 | 2388 | 100.0 | 38.90 | 23.4044 | 187 | 0 |
5 rows × 28 columns
# Class balance of the binary label in the training frame.
df_train['label_bc'].value_counts()
0 17531 1 3100 Name: label_bc, dtype: int64
# Columns fed to the scaler: the three settings followed by s1..s21.
features_col_name = [
    'setting1', 'setting2', 'setting3',
    's1', 's2', 's3', 's4', 's5', 's6', 's7',
    's8', 's9', 's10', 's11', 's12', 's13', 's14',
    's15', 's16', 's17', 's18', 's19', 's20', 's21',
]
target_col_name = 'label_bc'

# Learn each feature's min/max from the training frame only, then apply that
# same scaling to both the training and the test features.
sc = MinMaxScaler()
sc.fit(df_train[features_col_name])
df_train[features_col_name] = sc.transform(df_train[features_col_name])
df_test[features_col_name] = sc.transform(df_test[features_col_name])
df_train.head()
| id | cycle | setting1 | setting2 | setting3 | s1 | s2 | s3 | s4 | s5 | ... | s14 | s15 | s16 | s17 | s18 | s19 | s20 | s21 | ttf | label_bc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 0.459770 | 0.166667 | 0.0 | 0.0 | 0.183735 | 0.406802 | 0.309757 | 0.0 | ... | 0.199608 | 0.363986 | 0.0 | 0.333333 | 0.0 | 0.0 | 0.713178 | 0.724662 | 191 | 0 |
| 1 | 1 | 2 | 0.609195 | 0.250000 | 0.0 | 0.0 | 0.283133 | 0.453019 | 0.352633 | 0.0 | ... | 0.162813 | 0.411312 | 0.0 | 0.333333 | 0.0 | 0.0 | 0.666667 | 0.731014 | 190 | 0 |
| 2 | 1 | 3 | 0.252874 | 0.750000 | 0.0 | 0.0 | 0.343373 | 0.369523 | 0.370527 | 0.0 | ... | 0.171793 | 0.357445 | 0.0 | 0.166667 | 0.0 | 0.0 | 0.627907 | 0.621375 | 189 | 0 |
| 3 | 1 | 4 | 0.540230 | 0.500000 | 0.0 | 0.0 | 0.343373 | 0.256159 | 0.331195 | 0.0 | ... | 0.174889 | 0.166603 | 0.0 | 0.333333 | 0.0 | 0.0 | 0.573643 | 0.662386 | 188 | 0 |
| 4 | 1 | 5 | 0.390805 | 0.333333 | 0.0 | 0.0 | 0.349398 | 0.257467 | 0.404625 | 0.0 | ... | 0.174734 | 0.402078 | 0.0 | 0.416667 | 0.0 | 0.0 | 0.589147 | 0.704502 | 187 | 0 |
5 rows × 28 columns
# Print column dtypes, non-null counts and memory usage for the training frame.
df_train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 20631 entries, 0 to 20630 Data columns (total 28 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 20631 non-null int64 1 cycle 20631 non-null int64 2 setting1 20631 non-null float64 3 setting2 20631 non-null float64 4 setting3 20631 non-null float64 5 s1 20631 non-null float64 6 s2 20631 non-null float64 7 s3 20631 non-null float64 8 s4 20631 non-null float64 9 s5 20631 non-null float64 10 s6 20631 non-null float64 11 s7 20631 non-null float64 12 s8 20631 non-null float64 13 s9 20631 non-null float64 14 s10 20631 non-null float64 15 s11 20631 non-null float64 16 s12 20631 non-null float64 17 s13 20631 non-null float64 18 s14 20631 non-null float64 19 s15 20631 non-null float64 20 s16 20631 non-null float64 21 s17 20631 non-null float64 22 s18 20631 non-null float64 23 s19 20631 non-null float64 24 s20 20631 non-null float64 25 s21 20631 non-null float64 26 ttf 20631 non-null int64 27 label_bc 20631 non-null int64 dtypes: float64(24), int64(4) memory usage: 4.4 MB
# Smallest 'ttf' value in the training frame.
df_train['ttf'].min()
0
# Largest 'ttf' value in the training frame.
df_train['ttf'].max()
361
# Show every column of the first training row (after scaling and labelling).
df_train.iloc[0,:]
id 1.000000 cycle 1.000000 setting1 0.459770 setting2 0.166667 setting3 0.000000 s1 0.000000 s2 0.183735 s3 0.406802 s4 0.309757 s5 0.000000 s6 1.000000 s7 0.726248 s8 0.242424 s9 0.109755 s10 0.000000 s11 0.369048 s12 0.633262 s13 0.205882 s14 0.199608 s15 0.363986 s16 0.000000 s17 0.333333 s18 0.000000 s19 0.000000 s20 0.713178 s21 0.724662 ttf 191.000000 label_bc 0.000000 Name: 0, dtype: float64
# Draw seaborn's pairwise-relationship grid for the training frame.
# NOTE(review): the whole frame is passed in (28 columns at this point,
# including id, cycle, ttf and label_bc), so the grid is very large and is
# likely slow to render -- consider passing a subset of columns.
import seaborn as sns
sns.pairplot(df_train)
<seaborn.axisgrid.PairGrid at 0x1e9af779978>